In [1]:
%matplotlib inline
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import os
However, what happens if we have higher-order polynomial or other complex features? How will this impact the model's predictions?
Test 1
Input Features: x, x^2, x^3, x^4
Output/Target: y_noisy
Objective: How does the model behave when several additional features differ in scale by orders of magnitude?
Test 2
Input Features: x, x^2, x^3, x^4 - normalized using the AWS ML normalization transformation (sketched just below)
Output/Target: y_noisy
Objective: How much does normalization improve prediction accuracy?
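Before running the tests, it helps to see what the normalization transformation actually computes. The sketch below is not AWS ML's implementation, just an illustration of z-score scaling, which per the AWS ML documentation is the effect of its normalization transformation: each numeric variable is rescaled to mean 0 and variance 1. The column names assume the DataFrame constructed below.

def normalize_features(df, columns):
    # Rescale each column to mean 0, variance 1 (z-score),
    # mirroring what AWS ML's normalization transformation does
    normalized = df.copy()
    for col in columns:
        normalized[col] = (df[col] - df[col].mean()) / df[col].std()
    return normalized

# e.g. normalize_features(df, ['x', 'x2', 'x3', 'x4'])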
In [2]:
def quad_func(x):
    # True relationship: y = 5x^2 - 23x + 47. Only x and x^2 matter;
    # x^3 and x^4 are added later purely as distractor features.
    return 5 * x ** 2 - 23 * x + 47
In [3]:
# Training file: first 200 samples; AWS ML splits it 70/30 by default
#   into 140 training and 60 evaluation samples
# Test Set: remaining 60 samples
# Total: 260 samples
In [4]:
np.random.seed(5)
samples = 260

# x drawn uniformly from [0, 20); higher powers added as extra features
x_vals = pd.Series(np.random.rand(samples) * 20)
x2_vals = x_vals ** 2
x3_vals = x_vals ** 3
x4_vals = x_vals ** 4

# Clean target plus a noisy version (Gaussian noise, sigma = 50)
y_vals = x_vals.map(quad_func)
y_noisy_vals = y_vals + np.random.randn(samples) * 50
In [5]:
df = pd.DataFrame(
    {'x': x_vals,
     'x2': x2_vals,
     'x3': x3_vals,
     'x4': x4_vals,
     'y': y_vals,
     'y_noisy': y_noisy_vals})
In [6]:
df.head()
Out[6]:
In [8]:
#df.plot(x='x',y='y', grid=True, kind='scatter')
fig = plt.figure(figsize = (12, 8))
plt.scatter(x = df['x'], y = df['y'], color = 'r', label = 'y')
plt.scatter(x = df['x'], y = df['y_noisy'], color='b', label = 'y noisy')
plt.scatter(x = df['x'], y = df['x2'], color = 'k', label = 'x^2')
plt.scatter(x = df['x'], y = df['x3'], color = 'g', label = 'x^3')
plt.scatter(x = df['x'], y = df['x4'], color = 'y', label = 'x^4')
plt.ylim((-500, 2000))
plt.xlabel('x')
plt.ylabel('Target Attribute')
plt.title('Higher Order Polynomial Features')
plt.grid(True)
plt.legend()
Out[8]:
In [9]:
df.corr()
Out[9]:
In [10]:
df.describe()
Out[10]:
In [11]:
# Build the path with os.path.join so it works on any platform
# (backslash literals like '..\Data\...' break outside Windows)
data_path = os.path.join('..', 'Data', 'RegressionExamples', 'quadratic_more_features')
In [13]:
df.to_csv(os.path.join(data_path, 'quadratic_more_features_example_all.csv'),
index = True,
index_label = 'Row')
In [14]:
# Training file for AWS ML: first 200 rows, features plus noisy target
df[df.index < 200].to_csv(os.path.join(data_path, 'quadratic_more_features_example_train.csv'),
                          index = True,
                          index_label = 'Row',
                          columns = ['x','x2','x3','x4','y_noisy'])
In [17]:
# Batch-prediction input: feature columns only, no target
df.to_csv(os.path.join(data_path, 'quadratic_more_features_example_test_all.csv'),
          index = True,
          index_label = 'Row',
          columns = ['x','x2','x3','x4'])
In [21]:
# Pull Predictions
df = pd.read_csv(os.path.join(data_path,'quadratic_more_features_example_all.csv'),
index_col = 'Row')
df_more_features_predicted = pd.read_csv(os.path.join(data_path,'output_more_features',
'bp-vYUTRecXupi-quadratic_more_features_example_test_all.csv.gz'))
df_more_features_predicted.columns = ["Row","y_predicted"]
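Note that the plots below line the predictions up with df positionally. Since both files carry a Row column, an explicit join is safer; a minimal sketch, assuming the Row values in the batch-prediction output match df's index:

# Align predictions to the original rows by key rather than by position
df_joined = df.join(df_more_features_predicted.set_index('Row'))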
In [22]:
fig = plt.figure(figsize = (12, 8))
plt.scatter(x = df['x'], y = df['y_noisy'], color = 'b', label = 'actual')
plt.scatter(x = df['x'], y = df_more_features_predicted['y_predicted'],
color = 'g', label = 'Predicted before norm')
plt.ylim((-500, 2500))
plt.title('AWS ML - Higher Order Polynomial Features')
plt.xlabel('x')
plt.ylabel('Target Attribute')
plt.grid(True)
plt.legend()
Out[22]:
In [23]:
fig = plt.figure(figsize = (12, 8))
plt.boxplot([df.y_noisy,
df_more_features_predicted.y_predicted],
labels = ['actual', 'predicted before norm'])
plt.title('Box Plot - Actual, Predicted')
plt.ylim((-500, 5000))
plt.ylabel('y')
plt.grid(True)
Training RMSE: 83973.66, Evaluation RMSE: 158260.62, Baseline RMSE: 437.31
x3 and x4 are very large compared to the rest of the features, and these terms completely dominate the outcome.
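The scale gap is easy to quantify from the data already in memory:

# How large is each feature relative to x? With x in [0, 20),
# x4 reaches roughly 160,000 while x itself stays under 20.
scales = df[['x', 'x2', 'x3', 'x4']].abs().max()
print(scales / scales['x'])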
In [24]:
# Pull normalized predictions
df_more_normalize_features_predicted = pd.read_csv(
os.path.join(data_path,'output_more_features_normalize',
'bp-zlJUq5GZtsS-quadratic_more_features_example_test_all.csv.gz'))
df_more_normalize_features_predicted.columns = ["Row","y_predicted"]
In [26]:
fig = plt.figure(figsize = (12, 8))
plt.scatter(x = df['x'], y = df['y_noisy'], color = 'b', label = 'actual')
plt.scatter(x = df['x'], y = df_more_features_predicted['y_predicted'],
color = 'g', label = 'Prediction before norm')
plt.scatter(x = df['x'], y = df_more_normalize_features_predicted['y_predicted'],
color = 'r', label = 'Prediction after norm')
plt.ylim((-500, 2500))
plt.title('AWS ML - Higher Order Polynomial Features')
plt.grid(True)
plt.legend()
Out[26]:
In [28]:
fig = plt.figure(figsize = (12, 8))
plt.boxplot([df.y_noisy,
df_more_features_predicted.y_predicted,
df_more_normalize_features_predicted.y_predicted],
labels=['actual','predicted before norm','predicted norm'])
plt.title('Box Plot - Actual, Predicted')
plt.ylim((-500,5000))
plt.ylabel('y')
plt.grid(True)
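To tie the box plots back to the RMSE numbers reported above, the metric can be recomputed locally; a quick sketch, assuming the prediction rows are positionally aligned with df:

def rmse(actual, predicted):
    # Root mean squared error: sqrt(mean((actual - predicted)^2))
    return np.sqrt(np.mean((np.asarray(actual) - np.asarray(predicted)) ** 2))

print('RMSE before norm:', rmse(df['y_noisy'], df_more_features_predicted['y_predicted']))
print('RMSE after norm: ', rmse(df['y_noisy'], df_more_normalize_features_predicted['y_predicted']))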